*************************************************************************
*************************************************************************
**** Cleaning MORG data from 1979 onwards
*************************************************************************
*************************************************************************


/*************************************************************************
Builds on code written by Andres Santos and Zheng Fang. 
Used to process the MORG files as needed to analyze the inequality measures 
*************************************************************************/

* Start from a clean session: clear what is in memory, then drop all macros
clear all
macro drop _all
* Folder the yearly MORG input files are read from (used by the use command in the loop)
global input  "/Users/jvogel/Dropbox/JvogelPrivateWork/Cannonical_Model_MW/Replication/Data/Input/Data_NBER_MORG/"
* Folder the cleaned yearly files are saved to (used by the save command in the loop)
global output "/Users/jvogel/Dropbox/JvogelPrivateWork/Cannonical_Model_MW/Replication/Data/Output/MORGcleaned/"

* One pass per survey year. The loop index y is the year minus 1900 (79 is 1979);
* NOTE(review): values above 99 presumably continue the count (100 is 2000) - confirm against the input file names
forvalues y = 79/120 {

 * Read this year's MORG extract, replacing whatever is in memory
 use "${input}/morg`y'", clear

 * Restrict the sample to ages 16 through 64
 keep if inrange(age, 16, 64)

 * Rebuild the weight: the shipped weight variable is discarded and replaced by earnwt/12
 drop weight
 generate weight = earnwt/12

 * Indicator for men (sex equal to 1 in the MORG coding)
 generate male = sex==1

 * Employment status recode, held in esr. From 1989 on it is copied from
 * lfsr89 (1989-93) or lfsr94 (1994 onwards); earlier files presumably ship esr already,
 * since it is used for every year further down. Code 2 is treated as employed but not at work.
 if inrange(`y', 89, 93) {
  generate esr = lfsr89
 }
 else if `y' >= 94 {
  generate esr = lfsr94
 }

 * Weekly hours are taken from hourslw
 * NOTE(review): the earlier comment described these as usual hours; the variable name suggests hours last week - confirm
 rename hourslw hours

 * Employed but not at work (esr equal to 2): no hours worked
 replace hours = 0 if esr==2

 * Force hours into the 0 to 99 range; missing values are left untouched
 replace hours = 0  if hours < 0
 replace hours = 99 if hours > 99 & hours~=.
  
 * Education. First build educ (years of schooling, highest grade completed),
 * then collapse it into the five groups stored in edu.
 if (`y'>=79 & `y'<=91) {
  * Inputs for the pre-1992 coding
  rename gradeat at     // The highest grade attended
  rename gradecp cp     // = 1 if highest grade attended was completed

  * Completed: educ is the grade attended; not completed: one grade less.
  * Written with cond() so the result is right whether "not completed" is coded 0 or 2
  * (the arithmetic form cp*at + (1-cp)*(at-1) returns at+1 under a 1/2 coding).
  * NOTE(review): confirm the gradecp coding against the NBER MORG documentation.
  * Records with a missing input keep a missing educ.
  gen educ = cond(cp==1, at, at-1) if !missing(at, cp)
  replace educ = 0 if educ<0
 }
 if `y' > 91 {
  * The coding of the education variable changes after 1991. Here we
  * map the grade92 categories to years of schooling as in Lemieux 2006
  gen educ = grade92
  replace educ=0   if educ==31             // Grade92 31 means 1st grade not completed
  replace educ=2.5 if educ==32             // Grade92 32 means 1st to 4th grade completed
  replace educ=5.5 if educ==33             // Grade92 33 means 5th to 6th grade completed
  replace educ=7.5 if educ==34             // Grade92 34 means 7th and 8th grade completed
  replace educ=9   if educ==35             // Grade92 35 means 9th grade
  replace educ=10  if educ==36             // Grade92 36 means 10th grade
  replace educ=11  if inlist(educ, 37, 38) // Grade92 37 means 11th grade, 38 means 12th grade but no diploma
  replace educ=12  if educ==39             // Grade92 39 means high school degree or GED
  replace educ=13  if inrange(educ, 40, 42) // Grade92 40 means some college, 41 vocational degree, 42 associate degree
  replace educ=16  if educ==43             // Grade92 43 means college degree
  * Stata ranks missing above every number, so guard the open-ended comparison;
  * otherwise a missing grade92 would be recorded as a graduate degree
  replace educ=18  if educ>43 & educ<.     // Grade92 larger than 43 is a form of graduate degree
 }

 * Five attainment groups: 1 below 12 years, 2 exactly 12, 3 between 12 and 16,
 * 4 exactly 16, 5 above 16. Missing educ stays missing in edu so that the
 * later check on edu==. can drop those records.
 gen edu=.
 replace edu=1 if educ<12
 replace edu=2 if educ==12
 replace edu=3 if educ>12 & educ<16
 replace edu=4 if educ==16
 replace edu=5 if educ>16 & educ<.

 **********WAGES

 * The wage construction loosely follows Lemieux and Autor, Katz and Kearney.
 * STEP 1: weekly earnings (earnwke) with topcoded values scaled by 1.5.
 * The topcode value depends on the survey year.
 if inrange(`y', 79, 88) {
  replace earnwke = 1.5*999 if earnwke == 999
 }
 else if inrange(`y', 89, 93) {
  * The unedited series uearnwk has a slightly higher topcode than the edited one.
  * For records at the edited topcode, switch to the unedited value when it is larger
  * (the original author notes that in practice this replaces nothing)
  gen change = (earnwke==1923 & uearnwk~=.)
  replace earnwke = uearnwk if change==1 & uearnwk>earnwke
  * Then scale both the edited topcode and the unedited topcode of 1999
  replace earnwke = 1.5*1923 if earnwke == 1923
  replace earnwke = 1.5*1999 if change==1 & uearnwk==1999
 }
 else if inrange(`y', 94, 97) {
  replace earnwke = 1.5*1923 if earnwke==1923
 }
 else if `y' >= 98 {
  * Everything at or above 2884 is treated as topcoded, because the censored value sometimes carries decimals
  replace earnwke = 1.5*2884 if earnwke >= 2884 & earnwke~=.
 }
 
 * STEP 2: hourly wage series.
 * earnhre is recorded in pennies while weekly earnings are in dollars, so convert
 gen wagehr = earnhre/100

 * Topcoding of the hourly wage is not applied uniformly; we follow the NBER notes.
 * In every year a wage at 99.99 per hour is topcoded (the stored numbers are not exact,
 * hence the 99.98 threshold)
 gen top1 = (wagehr >= 99.98 & wagehr~=.)

 if `y' >= 85 {
  * From 1985 on a record is also censored when the implied weekly total
  * (hourly wage times uhourse) reaches the weekly topcode: 1923 through 1997, 2884 afterwards.
  * A few such records were left uncensored in the source; they are flagged here for consistency.
  * From 1994 uhourse can be -4 for hours that vary; those records are not caught by top2.
  local wkcap = cond(`y' <= 97, 1923, 2884)
  gen tot = wagehr*uhourse
  gen top2 = (tot >= `wkcap' & tot~=.)
  * A record already flagged by top1 is not flagged a second time
  replace top2 = 0 if top1 == 1
  * The two flags are mutually exclusive, so each flagged wage is scaled by 1.5 exactly once
  replace wagehr = 1.5*wagehr if top1==1 | top2==1
 }
 else {
  * 1979 to 1984: only the 99.99 hourly topcode applies
  replace wagehr = 1.5*wagehr if top1==1
 }

 * Negative earnings or wages are set to missing, following the checks in Autor, Katz and Kearney
 replace wagehr=. if wagehr<0
 replace earnwke=. if earnwke<0

 * Turn paidhre into a 0/1 indicator for hourly workers. The source variable
 * is 1 for hourly and 2 for not hourly; anything other than 1, missing included, becomes 0
 replace paidhre = paidhre==1

 * Single wage series: the hourly wage for hourly workers, weekly earnings
 * divided by weekly hours for everyone else
 gen wage = cond(paidhre==1, wagehr, earnwke/hours)

 * Weekly income implied by the combined wage and hours per week
 gen inc = wage*hours
 
 
 **********ALLOCATION FLAGS

 * alloc = 1 marks records whose earnings are treated as allocated (imputed);
 * they are excluded from the wage sample further down
 gen alloc=0

 * A missing flag counts as not allocated. 1994 is skipped because
 * every record is declared allocated in that year
 if `y'~=94 {
  replace I25c=0 if I25c==.
  replace I25d=0 if I25d==.
 }
 replace paidhre=0 if paidhre==.

 if `y' == 94 {
  * In 1994 the flag is missing; following Lemieux we declare all allocated
  replace alloc = 1
 }
 else {
  * All other years: use the nonzero flags as in Autor, Katz and Kearney
  replace alloc=1 if (paidhre == 1 & I25c > 0)		// If allocated and using hourly wages, flag
  replace alloc=1 if (paidhre ~= 1 & I25d > 0)		// If allocated and using weekly wages, flag
  if `y' == 95 {
   * In 1995 the flag was not recorded until September, so earlier months are set as allocated
   replace alloc=1 if intmonth <= 8
  }
 }

 * The loop index is the two-digit year, so this range must be 89 to 93.
 * The previous bounds of 1989 and 1993 could never match and the check was silently skipped.
 if (89<=`y' & `y'<=93) {
  * Following Hirsch and Schumacher: missing observations were imputed but the flag is faulty.
  * Detect them through the unedited series: flag anybody with missing unedited earnings
  * NOTE(review): assumes earnhr is the unedited hourly series and is present in the 1989-93 files - confirm
  replace alloc=1 if (paidhre == 1 & earnhr==.)
  replace alloc=1 if (paidhre ~= 1 & uearnwk==.)
 }

 **********SAMPLE SELECTION

 * use = 1 marks records that enter the wage sample. Start from the unallocated records
 gen use = (alloc ~= 1)

 * Employment is established in two passes, following Autor, Katz and Kearney.
 * Pass I: class of worker. The cutoffs are tighter than theirs (2, 4 and 5
 * instead of 3, 6 and 7) in order to exclude the self-employed
 gen emp=0
 if inrange(`y', 79, 88) {
  replace emp = 1 if inrange(classer, 1, 2)
 }
 else if inrange(`y', 89, 93) {
  replace emp = 1 if inrange(classer2, 1, 4)
 }
 else if `y' > 93 {
  replace emp = 1 if inrange(class94, 1, 5)
  * class94 is missing for those who never worked; keep them out explicitly
  replace emp = 0 if class94==.
 }
 * Pass II: the employment status recode identifies those not employed
 replace emp=0 if esr>2

 * Drop from the sample anyone who is not employed, lacks a wage or an education
 * group, or has zero or missing hours (hours are used as weights later)
 replace use=0 if emp~=1 | wage==. | edu==. | hours==0 | hours==.

 label variable use "use wage data if ==1"


 **********Saving the output

 save "${output}/wage`y'", replace
}
